# This is the pre-processing script used as part of the text analysis component. It includes the following steps:
# * lemmatizing the Hebrew open responses (using YAP and its python wrapper yapwrapper)
# * identifying bigram phrases (using gensim)
# * running ElasticNet regression on the document-term matrix and the thermometer scores (using sklearn)
# * translating Hebrew tokens into English (using googletrans)
# * identifying sentiment of each word (using the SentiWordNet lexicon)

import pandas as pd
import sys
from yapwrapper.yap_api import YapApi
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
import os
from tqdm import tqdm_notebook as tqdm
from gensim.models.phrases import Phrases, Phraser
from collections import Counter
from googletrans import Translator
import time
import numpy as np
from nltk.stem import WordNetLemmatizer

### load raw textual data file
# read the survey export and keep only rows without any missing value
df = pd.read_csv("/home/etherx/traits.csv").dropna()

# helper function to lemmatize text with yapwrapper
# %%
def lemmatize_text(text, ip='127.0.0.1:8000'):
    """Lemmatize ``text`` through a YAP server and return the lemmas.

    Args:
        text: The text to send to YAP.
        ip: ``host:port`` address handed to ``YapApi.run``. Defaults to the
            local address this script was written against.

    Returns:
        The ``lemmas`` value returned by ``YapApi.run``. When that value is
        the empty string, a notice is printed and ``text`` is returned
        unchanged instead.
    """
    yap = YapApi()
    # run() returns six values; only the third one (the lemmas) is used here.
    _, _, lemmas, _, _, _ = yap.run(text, ip)
    if lemmas != "":
        return lemmas
    print(f"No lemmas detected! returning {text}")
    return text

# lemmatize open responses
# each party column is processed in turn, with a progress bar per column
(
    traits_likud,
    traits_kahol_lavan,
    traits_avoda,
    traits_aguda,
    traits_joint_list,
) = (
    [lemmatize_text(response) for response in tqdm(df[column].to_list())]
    for column in (
        "traits_likud",
        "traits_kahol_lavan",
        "traits_avoda",
        "traits_aguda",
        "traits_joint_list",
    )
)

# aggregate all tokens
# every lemmatized response (all parties) is split on single spaces and the
# pieces are collected into one flat list
alltokens = []
alltraits = [
    *traits_likud,
    *traits_kahol_lavan,
    *traits_avoda,
    *traits_aguda,
    *traits_joint_list,
]

for x in alltraits:
    alltokens += x.split(" ")

# generate dataframe of tokens by frequency
# NOTE: the column named "val" holds the token text and the column named
# "token" holds its count (most frequent first)
ranked_tokens = Counter(alltokens).most_common()
tokdf = pd.DataFrame(
    {
        "val": [text for text, _ in ranked_tokens],
        "token": [count for _, count in ranked_tokens],
    }
)

# save and reload after manual correction of lemmatization
# the file is written here, edited by hand, then read back; the "newval"
# column used below is not written by this script, so it has to be added
# during that manual step
tokdf.to_csv("tokens.csv", index=False)
# reload corrected file, dropping rows with any missing value
tokdf = pd.read_csv("tokens.csv").dropna()

# create dictionary for applying corrections (original token -> corrected token)
tokdict = dict(zip(tokdf["val"], tokdf["newval"]))

# generate data for Phrases model
# One token list per response, with the manual corrections from tokdict
# applied (tokens without a correction are kept as they are).
# str.split(" ") always yields at least one element, so single-token
# responses go through exactly the same path as longer ones.
alltokens_ph = [
    [tokdict.get(token, token) for token in response.split(" ")]
    for response in alltraits
]

# run phrases model
phrases = Phrases(alltokens_ph)

# save
phrases.save("phrases_model.obj")

# helper function
def proc(x):
    """Split ``x`` on single spaces and apply the ``tokdict`` corrections.

    Returns a list with one entry per piece: the corrected token when the
    piece is a key of ``tokdict``, otherwise the piece itself.
    """
    return [tokdict.get(piece, piece) for piece in x.split(" ")]


# apply manual fixes and phrase detection to the lemmatized responses
for column, lemmatized in (
    ("traits_likud_proc", traits_likud),
    ("traits_kahol_lavan_proc", traits_kahol_lavan),
    ("traits_avoda_proc", traits_avoda),
    ("traits_aguda_proc", traits_aguda),
    ("traits_joint_list_proc", traits_joint_list),
):
    df[column] = phrases[[proc(response) for response in lemmatized]]

# vectorize and fit model per party
vectorizer = CountVectorizer()

# helper function for model training
def fit_model_and_save(namex, namey, partyname):
    """Fit an ElasticNetCV model on one party's tokens and save the weights.

    Args:
        namex: Name of the ``df`` column holding the token lists; each list
            is joined with spaces before being vectorized.
        namey: Name of the ``df`` column used as the regression target.
        partyname: Prefix of the output file
            ``{partyname}_elasticnet_predictivewords.csv``.

    Returns:
        A DataFrame with columns ``feature``, ``weight`` and ``absweight``,
        sorted by ``absweight`` in descending order. The same frame is
        written to the CSV file.
    """
    documents = [" ".join(tokens) for tokens in df[namex]]
    x = vectorizer.fit_transform(documents)
    reg = ElasticNetCV(max_iter=15000)
    reg.fit(x, df[namey])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on versions that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()

    wordweights = pd.DataFrame({"feature": features,
                                "weight": reg.coef_,
                                "absweight": np.abs(reg.coef_)})
    wordweights = wordweights.sort_values(by="absweight", ascending=False)
    wordweights.to_csv(f"{partyname}_elasticnet_predictivewords.csv")
    return wordweights

# run ElasticNet regression for each party with its corresponding thermometer score variable
likud, kahol_lavan, avoda, aguda, joint_list = (
    fit_model_and_save(tokens_column, score_column, party)
    for tokens_column, score_column, party in (
        ("traits_likud_proc", "w7th_1", "likud"),
        ("traits_kahol_lavan_proc", "w7th_2", "kahol_lavan"),
        ("traits_avoda_proc", "w7th_3", "avoda"),
        ("traits_aguda_proc", "w7th_7", "aguda"),
        ("traits_joint_list_proc", "w7th_9", "joint_list"),
    )
)

# aggregate output: one row per distinct feature across all parties
allfeatures = pd.concat(
    [likud, kahol_lavan, avoda, aguda, joint_list]
).drop_duplicates(subset=["feature"])[["feature"]]

## Translate features to English
translator = Translator()
word = allfeatures["feature"].to_list()
english = []

# Translate the features one at a time, in order. A failed request is
# retried after a one-second pause, with no retry limit, so the loop only
# ends once every feature has been translated.
while len(english) < len(word):
    try:
        english.append(translator.translate(word[len(english)]))
    except Exception as err:
        # Exception (rather than a bare except) keeps Ctrl-C / SystemExit
        # able to stop the loop, and the cause is reported instead of hidden.
        print(f"Error on number {len(english)} ({err!r}), sleeping...")
        time.sleep(1)

# %%
# keep only the translated text of each result
allfeatures["english"] = [x.text for x in english]

# save and reload after manual fixes to translation
# the machine translations are written to one file and the hand-corrected
# version is read back from a different file, which this script does not create
# %%
allfeatures.to_csv("translated_features.csv")
allfeatures_fix = pd.read_csv("translated_features_manual_fix.csv")

# add sentiment to fixed feature list
# load sentiwords (tab-separated; the first 25 lines of the file are skipped)
# NOTE(review): read_csv treats the first remaining line as a header row, so
# that line does not end up in the data - confirm it is not a lexicon entry
sw = pd.read_csv("SentiWords_1.1.txt", sep="\t", skiprows=25)
# rename the two columns; proc_word looks entries up by "word" and averages "polarity"
sw.columns = ["word", "polarity"]
# load wordnet lemmatizer (used by proc_word as a fallback lookup)
wnl = WordNetLemmatizer()

# helper function for extracting sentiment score
def _mean_polarity(prefix):
    """Return the mean polarity of ``sw`` entries starting with ``prefix``.

    Returns ``None`` when no entry matches.
    """
    # Literal prefix match: characters such as ".", "(" or "+" in the prefix
    # are compared as-is instead of being interpreted as a regex.
    hits = sw.loc[sw["word"].str.startswith(prefix, na=False), "polarity"]
    return hits.mean() if len(hits) else None


def proc_word(word):
    """Look up a sentiment score for an English word or phrase in ``sw``.

    The word is normalised (spaces become underscores, apostrophes are
    removed, everything is lowercased) and then looked up in three steps,
    returning the mean polarity of the first step that finds entries:

    1. entries starting with ``<word>#``;
    2. entries starting with ``<word>``;
    3. entries starting with the WordNet lemma of ``<word>``.

    Args:
        word: The English text to score. Anything that is not a non-empty
            string (for example a missing value) scores 0.

    Returns:
        The mean polarity of the matching entries, or 0 when the input is
        not usable or nothing matches.
    """
    if not isinstance(word, str) or not word:
        return 0

    word = word.replace(" ", "_").replace("'", "").lower()
    if not word:
        # Nothing left after normalisation; an empty prefix would otherwise
        # match every entry of the lexicon.
        return 0

    for prefix in (f"{word}#", word):
        score = _mean_polarity(prefix)
        if score is not None:
            return score

    score = _mean_polarity(wnl.lemmatize(word))
    return 0 if score is None else score


# process all features
# score every (manually fixed) English translation, with a progress bar
allfeatures_fix["sentiwords"] = list(map(proc_word, tqdm(allfeatures_fix["english"])))

# lowercase
# only plain strings are lowercased; any other value is passed through untouched
newenglish = [
    value.lower() if type(value) is str else value
    for value in allfeatures_fix["english"]
]

allfeatures_fix["english"] = newenglish


# add sentiment weight to party prediction files

def add(df, savename):
    """Left-join ``allfeatures_fix`` onto ``df`` by ``feature`` and save it.

    The joined frame is written to
    ``{savename}_elasticnet_predictivewords_w_sentiment.csv`` and returned.
    """
    merged = df.merge(allfeatures_fix, on="feature", how="left")
    merged.to_csv(f"{savename}_elasticnet_predictivewords_w_sentiment.csv")
    return merged

# produce processed data files for analysis
# each party's word-weight table is replaced by its sentiment-enriched version
likud, kahol_lavan, avoda, aguda, joint_list = (
    add(weights, label)
    for weights, label in (
        (likud, "likud"),
        (kahol_lavan, "kahol_lavan"),
        (avoda, "avoda"),
        (aguda, "aguda"),
        (joint_list, "joint_list"),
    )
)
